{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " # HTML解析-南方学院新闻 & liepin实践\n",
    "\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import urllib.parse\n",
    "\n",
    "import pandas as pd\n",
    "import requests_html\n",
    "from requests_html import HTMLSession"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " # 上周回顾及翻页思考"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "200"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A1: fetch the news index page from nfu.edu.cn\n",
    "session = HTMLSession()\n",
    "# Explicit timeout (seconds): without one, requests can wait forever on a stalled server\n",
    "r = session.get(\"https://www.nfu.edu.cn/xydt/index.htm\", timeout=10)\n",
    "\n",
    "# Display the HTTP status code as the cell output\n",
    "r.status_code"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " ## html 页面数据的存与读"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save: write the fetched page's HTML to disk so it can be re-parsed without a new request\n",
    "# Create the output folder first; open(..., mode=\"w\") raises FileNotFoundError if it is missing\n",
    "os.makedirs(\"html_out\", exist_ok=True)\n",
    "with open(\"html_out/_nfu_文学与传媒学院.html\", encoding=\"utf8\", mode=\"w\") as fp:\n",
    "    fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load: read the saved HTML file back into memory as a single string\n",
    "with open(\"html_out/_nfu_文学与传媒学院.html\", \"r\", encoding=\"utf8\") as saved_page:\n",
    "    html_load = saved_page.read()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " ## 解析和重塑链接（内容链接）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Element html at 0x17cd03463b0>"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parse the HTML string loaded from disk into an element tree that supports .xpath()\n",
    "parsed = requests_html.soup_parse(html_load)\n",
    "parsed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## soup_parse 解析 ： str 格式的 html 文本 => Element html 元素对象"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ParseResult(scheme='https', netloc='www.nfu.edu.cn', path='/xydt/index.htm', params='', query='', fragment='')"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Split the response URL into its components (scheme, netloc, path, ...)\n",
    "base_url = r.url\n",
    "nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "nfu_urlparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/xydt/cf4420785b9046e99851413a1fb1b6f7.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/935f580040704990a4e396fa8091ee30.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/16f4c5f4bd284caebfe79cd5d66e288b.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/9611d110ec8a486587ab4020171ee9f5.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/2f3dcc0f4400419e8e42af09fde3c251.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/8b2414ee7cca45d88c4217dd13f8f8ec.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/a9523b72a34e4143afa9b38879ecba0c.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/f8434c9f092348c2a4a3c1ac8727a8fd.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/d60d33983337463390ef99385afce119.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/4b0c8e69b5074d28badd20cb55436009.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/bdda4dfbda944a3eb84612b7045620f4.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/96e05388e3fa43de9a8f446b083875f8.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/25dc7cb574284be18a6c9a0640e5aca3.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/b9777111c2194e7b85143431ab4706a7.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/c3846031b4c0444a99e0dfd90047c046.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/5e165fdaee834899891ba5b3eea69bc9.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/76d9a219517e4a2b8c28135b51f55e3b.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/bf6e98d6027544809e35f6f8d5ea5c3a.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/874bf131af7746d888ec1801eb31c064.htm',\n",
       " 'https://www.nfu.edu.cn/xydt/30f39cccbda84ecb92013ecfcc410ff6.htm']"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rebuild absolute article URLs from the hrefs found on the index page.\n",
    "# urljoin resolves each href against base_url, so plain relative names, '../' paths,\n",
    "# root-relative paths and already-absolute links are all handled, unlike manual\n",
    "# concatenation of the first path segment.\n",
    "list_URL = [\n",
    "    urllib.parse.urljoin(base_url, detail_url)\n",
    "    for detail_url in parsed.xpath('//div[@class=\"news_title\"]/a/@href')\n",
    "]\n",
    "list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>标题</th>\n",
       "      <th>链结</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>商学院电子商务专业召开申请调整学位授予学科门类 专家评审会</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/cf4420785b9046e998...</td>\n",
       "      <td>2021-04-07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>广州新华学院会计学院刘运国院长一行莅临我院访问</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/935f580040704990a4...</td>\n",
       "      <td>2021-04-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>会计学院大一年级大会顺利召开</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/16f4c5f4bd284caebf...</td>\n",
       "      <td>2021-04-06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>【国奖映像】蒋晓琳：明确目标，为之努力</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/9611d110ec8a486587...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>“远离糖尿病，筑起健康防线” 护理与健康学院寒假社会实践调查成果汇报展示圆满结束</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/2f3dcc0f4400419e8e...</td>\n",
       "      <td>2021-03-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>励能计划2021：你选哪一项？</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/8b2414ee7cca45d88c...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>党团同行传薪火，红色循迹筑初心——商学院党团同行重走“东江纵队”红色之路</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/a9523b72a34e4143af...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>广州南方学院老年与慢病护理研究中心学术沙龙系列第3期圆满结束</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/f8434c9f092348c2a4...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>凝心聚力 共谱新篇——商学院召开新学期全体教职工大会</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/d60d33983337463390...</td>\n",
       "      <td>2021-04-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>青马工程｜“回首峥嵘岁月，领悟红船精神”——商学院百年党史宣讲活动</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/4b0c8e69b5074d28ba...</td>\n",
       "      <td>2021-04-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>护理与健康学院“教学相长，从教学中成长”暨青年教师专题讲座与交流活动圆满结束</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/bdda4dfbda944a3eb8...</td>\n",
       "      <td>2021-04-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>党建观摩拓思路，对照标杆“取真经”——我校商学院师生团队赴广东外语外贸大学南国商学院管理学院...</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/96e05388e3fa43de9a...</td>\n",
       "      <td>2021-03-30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>护理与健康学院直属党支部2021年春季入党积极分子培训圆满结束</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/25dc7cb574284be18a...</td>\n",
       "      <td>2021-03-30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>成长在文传我院举行2021年春季学期全体教职工大会</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/b9777111c2194e7b85...</td>\n",
       "      <td>2021-03-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>倾心指导促研学，凝心聚力谋发展——大英中心教学研究座谈会顺利召开</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/c3846031b4c0444a99...</td>\n",
       "      <td>2021-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>护理与健康学院第37期师生面对面顺利开展</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/5e165fdaee83489989...</td>\n",
       "      <td>2021-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>凝聚共识谋规划， 师生同心促发展—— 电气与计算机工程学院召开全体教职工大会</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/76d9a219517e4a2b8c...</td>\n",
       "      <td>2021-03-25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>护理与健康学院“开卷行之”征文比赛圆满结束</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/bf6e98d6027544809e...</td>\n",
       "      <td>2021-03-24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>青春心向党，奋进新时代——护理与健康学院院长“思政第一课”顺利开讲</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/874bf131af7746d888...</td>\n",
       "      <td>2021-03-23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>护理与健康学院召开2021年春季工作会议</td>\n",
       "      <td>https://www.nfu.edu.cn/xydt/30f39cccbda84ecb92...</td>\n",
       "      <td>2021-03-22</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   标题  \\\n",
       "0                       商学院电子商务专业召开申请调整学位授予学科门类 专家评审会   \n",
       "1                             广州新华学院会计学院刘运国院长一行莅临我院访问   \n",
       "2                                      会计学院大一年级大会顺利召开   \n",
       "3                                 【国奖映像】蒋晓琳：明确目标，为之努力   \n",
       "4            “远离糖尿病，筑起健康防线” 护理与健康学院寒假社会实践调查成果汇报展示圆满结束   \n",
       "5                                     励能计划2021：你选哪一项？   \n",
       "6                党团同行传薪火，红色循迹筑初心——商学院党团同行重走“东江纵队”红色之路   \n",
       "7                      广州南方学院老年与慢病护理研究中心学术沙龙系列第3期圆满结束   \n",
       "8                          凝心聚力 共谱新篇——商学院召开新学期全体教职工大会   \n",
       "9                   青马工程｜“回首峥嵘岁月，领悟红船精神”——商学院百年党史宣讲活动   \n",
       "10             护理与健康学院“教学相长，从教学中成长”暨青年教师专题讲座与交流活动圆满结束   \n",
       "11  党建观摩拓思路，对照标杆“取真经”——我校商学院师生团队赴广东外语外贸大学南国商学院管理学院...   \n",
       "12                    护理与健康学院直属党支部2021年春季入党积极分子培训圆满结束   \n",
       "13                          成长在文传我院举行2021年春季学期全体教职工大会   \n",
       "14                   倾心指导促研学，凝心聚力谋发展——大英中心教学研究座谈会顺利召开   \n",
       "15                               护理与健康学院第37期师生面对面顺利开展   \n",
       "16             凝聚共识谋规划， 师生同心促发展—— 电气与计算机工程学院召开全体教职工大会   \n",
       "17                              护理与健康学院“开卷行之”征文比赛圆满结束   \n",
       "18                  青春心向党，奋进新时代——护理与健康学院院长“思政第一课”顺利开讲   \n",
       "19                               护理与健康学院召开2021年春季工作会议   \n",
       "\n",
       "                                                   链结          日期  \n",
       "0   https://www.nfu.edu.cn/xydt/cf4420785b9046e998...  2021-04-07  \n",
       "1   https://www.nfu.edu.cn/xydt/935f580040704990a4...  2021-04-06  \n",
       "2   https://www.nfu.edu.cn/xydt/16f4c5f4bd284caebf...  2021-04-06  \n",
       "3   https://www.nfu.edu.cn/xydt/9611d110ec8a486587...  2021-04-02  \n",
       "4   https://www.nfu.edu.cn/xydt/2f3dcc0f4400419e8e...  2021-03-29  \n",
       "5   https://www.nfu.edu.cn/xydt/8b2414ee7cca45d88c...  2021-04-02  \n",
       "6   https://www.nfu.edu.cn/xydt/a9523b72a34e4143af...  2021-04-02  \n",
       "7   https://www.nfu.edu.cn/xydt/f8434c9f092348c2a4...  2021-04-02  \n",
       "8   https://www.nfu.edu.cn/xydt/d60d33983337463390...  2021-04-01  \n",
       "9   https://www.nfu.edu.cn/xydt/4b0c8e69b5074d28ba...  2021-04-01  \n",
       "10  https://www.nfu.edu.cn/xydt/bdda4dfbda944a3eb8...  2021-04-01  \n",
       "11  https://www.nfu.edu.cn/xydt/96e05388e3fa43de9a...  2021-03-30  \n",
       "12  https://www.nfu.edu.cn/xydt/25dc7cb574284be18a...  2021-03-30  \n",
       "13  https://www.nfu.edu.cn/xydt/b9777111c2194e7b85...  2021-03-29  \n",
       "14  https://www.nfu.edu.cn/xydt/c3846031b4c0444a99...  2021-03-26  \n",
       "15  https://www.nfu.edu.cn/xydt/5e165fdaee83489989...  2021-03-26  \n",
       "16  https://www.nfu.edu.cn/xydt/76d9a219517e4a2b8c...  2021-03-25  \n",
       "17  https://www.nfu.edu.cn/xydt/bf6e98d6027544809e...  2021-03-24  \n",
       "18  https://www.nfu.edu.cn/xydt/874bf131af7746d888...  2021-03-23  \n",
       "19  https://www.nfu.edu.cn/xydt/30f39cccbda84ecb92...  2021-03-22  "
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Result table (B-D-1): build a pandas DataFrame from the extracted values.\n",
    "# Each column comes from its own XPath query (or from list_URL), so all three\n",
    "# sequences must have the same length for the constructor to succeed.\n",
    "news_columns = {\n",
    "    \"标题\": parsed.xpath('//div[@class=\"news_title\"]/a/@title'),\n",
    "    \"链结\": list_URL,\n",
    "    \"日期\": parsed.xpath('//font[@class=\"right-more\"]/text()'),\n",
    "}\n",
    "df = pd.DataFrame(news_columns)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 持久化存储"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# B-D-2: export the DataFrame to an Excel workbook\n",
    "# Create the output folder first; to_excel fails if the target directory is missing\n",
    "os.makedirs(\"data_out\", exist_ok=True)\n",
    "df.to_excel(\"data_out/nfu_文学与传媒学院2.xlsx\", sheet_name=\"检索结果\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
